library(leaps)
library(MASS)
library(ggplot2)
require(caTools)
## Loading required package: caTools

Data Summarizing

Pairwise Scatter Plots

  • Overall our data has many 27 columns, right away for our pairwise scatter plot we will not consider X, X.1, fips, county, and state in our summary statistics as X and X.1 were data left over indexes from merging columns fips is a unique value for each row that was used when merging and state is a categorical or qualititative variable that will be explored more with models.
data = read.csv('covid-project-raw-data/full_data_no_na.csv')
summary(data)
##       X.1               X               fips          county         
##  Min.   :   1.0   Min.   :   3.0   Min.   : 1001   Length:2944       
##  1st Qu.: 761.8   1st Qu.: 842.8   1st Qu.:18182   Class :character  
##  Median :1506.5   Median :1598.5   Median :29100   Mode  :character  
##  Mean   :1539.2   Mean   :1633.3   Mean   :30355                     
##  3rd Qu.:2330.2   3rd Qu.:2442.5   3rd Qu.:45319                     
##  Max.   :3109.0   Max.   :3234.0   Max.   :56045                     
##     state               cases              deaths          mask_score   
##  Length:2944        Min.   :    11.0   Min.   :   0.00   Min.   :1.433  
##  Class :character   1st Qu.:   469.5   1st Qu.:   6.00   1st Qu.:2.701  
##  Mode  :character   Median :  1101.5   Median :  17.00   Median :3.000  
##                     Mean   :  4073.6   Mean   :  78.48   Mean   :2.995  
##                     3rd Qu.:  2758.5   3rd Qu.:  48.00   3rd Qu.:3.296  
##                     Max.   :370729.0   Max.   :7446.00   Max.   :3.849  
##  ratio_pop_to_physician unemploy_rate      HS_grad_rate      flu_vacc_.    
##  Min.   :-3936          Min.   :0.01624   Min.   :0.3630   Min.   :0.0700  
##  1st Qu.: 1048          1st Qu.:0.03567   1st Qu.:0.8467   1st Qu.:0.3500  
##  Median : 1498          Median :0.04388   Median :0.8951   Median :0.4200  
##  Mean   : 1916          Mean   :0.04619   Mean   :0.8838   Mean   :0.4087  
##  3rd Qu.: 2202          3rd Qu.:0.05365   3rd Qu.:0.9344   3rd Qu.:0.4800  
##  Max.   :51307          Max.   :0.19066   Max.   :1.0000   Max.   :0.6500  
##    uninsured       median_income       pop_size           X._rural     
##  Min.   :0.02068   Min.   : 24783   Min.   :    1718   Min.   :0.0000  
##  1st Qu.:0.07036   1st Qu.: 42208   1st Qu.:   13074   1st Qu.:0.3222  
##  Median :0.10272   Median : 48728   Median :   28270   Median :0.5739  
##  Mean   :0.10937   Mean   : 50996   Mean   :  107303   Mean   :0.5678  
##  3rd Qu.:0.13841   3rd Qu.: 56546   3rd Qu.:   73641   3rd Qu.:0.8149  
##  Max.   :0.30877   Max.   :136191   Max.   :10163507   Max.   :1.0000  
##  life_expectancy non_hispanic_african_american american_indian   
##  Min.   :66.50   Min.   :0.0005099             Min.   :0.000496  
##  1st Qu.:75.49   1st Qu.:0.0073038             1st Qu.:0.003656  
##  Median :77.48   Median :0.0244137             Median :0.005952  
##  Mean   :77.40   Mean   :0.0925949             Mean   :0.019425  
##  3rd Qu.:79.22   3rd Qu.:0.1061252             3rd Qu.:0.012304  
##  Max.   :97.97   Max.   :0.8532961             Max.   :0.879781  
##      asian          native_hawaiian        hispanic        non_hispanic_white
##  Min.   :0.000000   Min.   :0.0000000   Min.   :0.005151   Min.   :0.0276    
##  1st Qu.:0.004604   1st Qu.:0.0003171   1st Qu.:0.022798   1st Qu.:0.6470    
##  Median :0.007189   Median :0.0005949   Median :0.042757   Median :0.8371    
##  Mean   :0.015009   Mean   :0.0012039   Mean   :0.094767   Mean   :0.7647    
##  3rd Qu.:0.014113   3rd Qu.:0.0011243   3rd Qu.:0.096909   3rd Qu.:0.9250    
##  Max.   :0.430067   Max.   :0.1312001   Max.   :0.963230   Max.   :0.9792    
##  sim_diversity_index  X._democrat           area          pop_density      
##  Min.   :0.04102     Min.   :0.06955   Min.   :    2.0   Min.   :    0.50  
##  1st Qu.:0.14154     1st Qu.:0.21860   1st Qu.:  426.7   1st Qu.:   20.98  
##  Median :0.28553     Median :0.30358   Median :  599.9   Median :   48.40  
##  Mean   :0.31371     Mean   :0.33789   Mean   :  946.6   Mean   :  214.13  
##  3rd Qu.:0.48660     3rd Qu.:0.42621   3rd Qu.:  900.2   3rd Qu.:  123.85  
##  Max.   :0.82897     Max.   :0.95695   Max.   :20056.9   Max.   :17179.10
head(data)
##   X.1 X fips  county   state cases deaths mask_score ratio_pop_to_physician
## 1   1 3 1001 Autauga Alabama  2634     39      3.003               3264.941
## 2   2 4 1003 Baldwin Alabama  8269     84      2.968               1915.568
## 3   3 5 1005 Barbour Alabama  1161     10      2.928               4211.667
## 4   4 6 1007    Bibb Alabama  1142     17      3.348               1079.429
## 5   5 7 1009  Blount Alabama  2763     36      2.892               5273.909
## 6   6 8 1011 Bullock Alabama   690     19      3.186               1288.625
##   unemploy_rate HS_grad_rate flu_vacc_.  uninsured median_income pop_size
## 1    0.03863522    0.9000000       0.41 0.08500966         58343    55504
## 2    0.03988336    0.8636158       0.45 0.10699288         56607   212628
## 3    0.05900923    0.8141026       0.37 0.12513197         32490    25270
## 4    0.04385140    0.8376384       0.39 0.09680075         45795    22668
## 5    0.04021393    0.9346880       0.38 0.12114004         48253    58013
## 6    0.04925187    0.7024793       0.26 0.13206483         29113    10309
##    X._rural life_expectancy non_hispanic_african_american american_indian
## 1 0.4200216        76.33059                    0.19254468     0.004756414
## 2 0.4227910        78.59950                    0.08953195     0.007760032
## 3 0.6778963        75.77946                    0.47942224     0.006529482
## 4 0.6835261        73.92827                    0.21457561     0.004279160
## 5 0.8995150        74.59777                    0.01460018     0.006326168
## 6 0.5137438        73.11753                    0.69162867     0.008439228
##         asian native_hawaiian   hispanic non_hispanic_white sim_diversity_index
## 1 0.012791871    0.0010449697 0.02857452          0.7447391           0.4072863
## 2 0.011564799    0.0006866452 0.04550200          0.8304739           0.3000323
## 3 0.004629996    0.0018599129 0.04206569          0.4595568           0.5571248
## 4 0.002205753    0.0011469914 0.02638080          0.7429857           0.4012091
## 5 0.003016565    0.0011721511 0.09565097          0.8694431           0.2346560
## 6 0.001940052    0.0077602095 0.08245223          0.2130178           0.4693396
##   X._democrat    area pop_density
## 1  0.24622532  594.44        91.8
## 2  0.20207793 1589.78       114.6
## 3  0.47176755  884.88        31.0
## 4  0.21760334  622.58        36.8
## 5  0.08618829  644.78        88.9
## 6  0.75588865  622.81        17.5
drops = c('X', 'X.1','fips', 'county', 'state')
plot(data[ , !(names(data) %in% drops)][1:10])

colnames(data[ , !(names(data) %in% drops)])
##  [1] "cases"                         "deaths"                       
##  [3] "mask_score"                    "ratio_pop_to_physician"       
##  [5] "unemploy_rate"                 "HS_grad_rate"                 
##  [7] "flu_vacc_."                    "uninsured"                    
##  [9] "median_income"                 "pop_size"                     
## [11] "X._rural"                      "life_expectancy"              
## [13] "non_hispanic_african_american" "american_indian"              
## [15] "asian"                         "native_hawaiian"              
## [17] "hispanic"                      "non_hispanic_white"           
## [19] "sim_diversity_index"           "X._democrat"                  
## [21] "area"                          "pop_density"
plot(data[ , !(names(data) %in% drops)][11:22])

summary(data[ , !(names(data) %in% drops)][1:22])
##      cases              deaths          mask_score    ratio_pop_to_physician
##  Min.   :    11.0   Min.   :   0.00   Min.   :1.433   Min.   :-3936         
##  1st Qu.:   469.5   1st Qu.:   6.00   1st Qu.:2.701   1st Qu.: 1048         
##  Median :  1101.5   Median :  17.00   Median :3.000   Median : 1498         
##  Mean   :  4073.6   Mean   :  78.48   Mean   :2.995   Mean   : 1916         
##  3rd Qu.:  2758.5   3rd Qu.:  48.00   3rd Qu.:3.296   3rd Qu.: 2202         
##  Max.   :370729.0   Max.   :7446.00   Max.   :3.849   Max.   :51307         
##  unemploy_rate      HS_grad_rate      flu_vacc_.       uninsured      
##  Min.   :0.01624   Min.   :0.3630   Min.   :0.0700   Min.   :0.02068  
##  1st Qu.:0.03567   1st Qu.:0.8467   1st Qu.:0.3500   1st Qu.:0.07036  
##  Median :0.04388   Median :0.8951   Median :0.4200   Median :0.10272  
##  Mean   :0.04619   Mean   :0.8838   Mean   :0.4087   Mean   :0.10937  
##  3rd Qu.:0.05365   3rd Qu.:0.9344   3rd Qu.:0.4800   3rd Qu.:0.13841  
##  Max.   :0.19066   Max.   :1.0000   Max.   :0.6500   Max.   :0.30877  
##  median_income       pop_size           X._rural      life_expectancy
##  Min.   : 24783   Min.   :    1718   Min.   :0.0000   Min.   :66.50  
##  1st Qu.: 42208   1st Qu.:   13074   1st Qu.:0.3222   1st Qu.:75.49  
##  Median : 48728   Median :   28270   Median :0.5739   Median :77.48  
##  Mean   : 50996   Mean   :  107303   Mean   :0.5678   Mean   :77.40  
##  3rd Qu.: 56546   3rd Qu.:   73641   3rd Qu.:0.8149   3rd Qu.:79.22  
##  Max.   :136191   Max.   :10163507   Max.   :1.0000   Max.   :97.97  
##  non_hispanic_african_american american_indian        asian         
##  Min.   :0.0005099             Min.   :0.000496   Min.   :0.000000  
##  1st Qu.:0.0073038             1st Qu.:0.003656   1st Qu.:0.004604  
##  Median :0.0244137             Median :0.005952   Median :0.007189  
##  Mean   :0.0925949             Mean   :0.019425   Mean   :0.015009  
##  3rd Qu.:0.1061252             3rd Qu.:0.012304   3rd Qu.:0.014113  
##  Max.   :0.8532961             Max.   :0.879781   Max.   :0.430067  
##  native_hawaiian        hispanic        non_hispanic_white sim_diversity_index
##  Min.   :0.0000000   Min.   :0.005151   Min.   :0.0276     Min.   :0.04102    
##  1st Qu.:0.0003171   1st Qu.:0.022798   1st Qu.:0.6470     1st Qu.:0.14154    
##  Median :0.0005949   Median :0.042757   Median :0.8371     Median :0.28553    
##  Mean   :0.0012039   Mean   :0.094767   Mean   :0.7647     Mean   :0.31371    
##  3rd Qu.:0.0011243   3rd Qu.:0.096909   3rd Qu.:0.9250     3rd Qu.:0.48660    
##  Max.   :0.1312001   Max.   :0.963230   Max.   :0.9792     Max.   :0.82897    
##   X._democrat           area          pop_density      
##  Min.   :0.06955   Min.   :    2.0   Min.   :    0.50  
##  1st Qu.:0.21860   1st Qu.:  426.7   1st Qu.:   20.98  
##  Median :0.30358   Median :  599.9   Median :   48.40  
##  Mean   :0.33789   Mean   :  946.6   Mean   :  214.13  
##  3rd Qu.:0.42621   3rd Qu.:  900.2   3rd Qu.:  123.85  
##  Max.   :0.95695   Max.   :20056.9   Max.   :17179.10
#testing
  • In addition to get a clear send of the distribution of our data a boxplot is created for all of the quantitative variables.
library(reshape2)
drops = c('X', 'X.1', 'county', 'state')

df.m <- melt(data[ , !(names(data) %in% drops)], id.var = "fips")

p <- ggplot(data = df.m, aes(x=variable, y=value)) + 
             geom_boxplot(aes(fill=fips))
p + facet_wrap( ~ variable, scales="free")

Qualitative/Quantitative variables summary in our data

sapply(data, class)
##                           X.1                             X 
##                     "integer"                     "integer" 
##                          fips                        county 
##                     "integer"                   "character" 
##                         state                         cases 
##                   "character"                     "integer" 
##                        deaths                    mask_score 
##                     "integer"                     "numeric" 
##        ratio_pop_to_physician                 unemploy_rate 
##                     "numeric"                     "numeric" 
##                  HS_grad_rate                    flu_vacc_. 
##                     "numeric"                     "numeric" 
##                     uninsured                 median_income 
##                     "numeric"                     "integer" 
##                      pop_size                      X._rural 
##                     "integer"                     "numeric" 
##               life_expectancy non_hispanic_african_american 
##                     "numeric"                     "numeric" 
##               american_indian                         asian 
##                     "numeric"                     "numeric" 
##               native_hawaiian                      hispanic 
##                     "numeric"                     "numeric" 
##            non_hispanic_white           sim_diversity_index 
##                     "numeric"                     "numeric" 
##                   X._democrat                          area 
##                     "numeric"                     "numeric" 
##                   pop_density 
##                     "numeric"

Model Building

Basic model

  • To begin getting an understanding for the entries in our dataset with relation to building a linear model, a simple mode with all factors from the data included is constructed.
basic_model = lm(cases ~ state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data)
summary(basic_model)
## 
## Call:
## lm(formula = cases ~ state + mask_score + ratio_pop_to_physician + 
##     unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + 
##     pop_size + X._rural + life_expectancy + sim_diversity_index + 
##     area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + 
##     american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, 
##     data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -51341   -785    -93    602  97809 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -1.080e+04  1.165e+04  -0.927 0.353809    
## stateAlaska                    8.885e+03  3.975e+03   2.235 0.025486 *  
## stateArizona                  -2.149e+03  1.261e+03  -1.704 0.088577 .  
## stateArkansas                 -2.618e+02  6.513e+02  -0.402 0.687721    
## stateCalifornia               -6.742e+03  8.667e+02  -7.779 1.01e-14 ***
## stateColorado                 -2.356e+03  7.786e+02  -3.026 0.002498 ** 
## stateConnecticut              -5.054e+03  1.460e+03  -3.462 0.000543 ***
## stateDelaware                 -2.943e+03  2.227e+03  -1.322 0.186430    
## stateDistrict of Columbia     -1.513e+04  3.945e+03  -3.835 0.000128 ***
## stateFlorida                  -1.777e+02  7.022e+02  -0.253 0.800208    
## stateGeorgia                  -6.235e+02  5.860e+02  -1.064 0.287376    
## stateHawaii                    1.477e+04  5.370e+03   2.751 0.005983 ** 
## stateIdaho                    -5.391e+02  8.546e+02  -0.631 0.528218    
## stateIllinois                  9.397e+02  6.744e+02   1.393 0.163595    
## stateIndiana                   2.517e+02  6.473e+02   0.389 0.697448    
## stateIowa                      8.015e+02  7.142e+02   1.122 0.261883    
## stateKansas                   -3.285e+02  6.405e+02  -0.513 0.608097    
## stateKentucky                 -4.557e+02  6.510e+02  -0.700 0.484049    
## stateLouisiana                -5.453e+02  6.690e+02  -0.815 0.415119    
## stateMaine                    -2.890e+03  1.118e+03  -2.586 0.009757 ** 
## stateMaryland                 -2.208e+03  9.302e+02  -2.374 0.017649 *  
## stateMassachusetts            -5.768e+03  1.214e+03  -4.752 2.11e-06 ***
## stateMichigan                 -1.620e+03  7.128e+02  -2.272 0.023160 *  
## stateMinnesota                 5.050e+02  7.425e+02   0.680 0.496483    
## stateMississippi              -5.269e+02  6.537e+02  -0.806 0.420323    
## stateMissouri                  6.647e+01  6.269e+02   0.106 0.915559    
## stateMontana                  -1.515e+02  8.116e+02  -0.187 0.851900    
## stateNebraska                 -1.089e+02  6.916e+02  -0.157 0.874924    
## stateNevada                    2.408e+02  1.219e+03   0.198 0.843393    
## stateNew Hampshire            -3.043e+03  1.331e+03  -2.286 0.022356 *  
## stateNew Jersey               -1.387e+03  1.016e+03  -1.365 0.172391    
## stateNew Mexico               -4.764e+03  9.931e+02  -4.797 1.70e-06 ***
## stateNew York                 -3.254e+03  7.825e+02  -4.159 3.29e-05 ***
## stateNorth Carolina           -1.875e+03  6.217e+02  -3.016 0.002581 ** 
## stateNorth Dakota              2.445e+03  1.209e+03   2.022 0.043275 *  
## stateOhio                     -1.617e+03  6.754e+02  -2.395 0.016688 *  
## stateOklahoma                 -5.249e+02  8.601e+02  -0.610 0.541741    
## stateOregon                   -3.933e+03  8.976e+02  -4.382 1.22e-05 ***
## statePennsylvania             -3.744e+03  7.228e+02  -5.180 2.37e-07 ***
## stateRhode Island             -1.702e+03  1.798e+03  -0.947 0.343865    
## stateSouth Carolina           -8.345e+02  7.260e+02  -1.149 0.250522    
## stateSouth Dakota              4.649e+02  7.560e+02   0.615 0.538583    
## stateTennessee                 4.004e+02  6.255e+02   0.640 0.522204    
## stateTexas                    -2.663e+03  6.530e+02  -4.077 4.68e-05 ***
## stateUtah                      1.155e+03  9.219e+02   1.253 0.210483    
## stateVermont                  -1.796e+03  1.297e+03  -1.385 0.166300    
## stateVirginia                 -1.436e+03  5.980e+02  -2.401 0.016423 *  
## stateWashington               -4.189e+03  8.734e+02  -4.796 1.70e-06 ***
## stateWest Virginia            -1.029e+03  7.512e+02  -1.370 0.170818    
## stateWisconsin                 1.853e+03  7.396e+02   2.505 0.012303 *  
## stateWyoming                  -5.560e+02  1.011e+03  -0.550 0.582271    
## mask_score                     4.137e+02  3.107e+02   1.332 0.183092    
## ratio_pop_to_physician        -5.566e-02  3.973e-02  -1.401 0.161365    
## unemploy_rate                  1.051e+04  7.019e+03   1.498 0.134359    
## HS_grad_rate                  -1.043e+03  1.315e+03  -0.793 0.427626    
## flu_vacc_.                     1.347e+03  1.026e+03   1.314 0.188988    
## uninsured                     -1.149e+03  4.343e+03  -0.265 0.791294    
## median_income                 -2.338e-02  1.005e-02  -2.326 0.020098 *  
## pop_size                       4.057e-02  2.596e-04 156.305  < 2e-16 ***
## X._rural                      -9.170e+02  3.648e+02  -2.514 0.011986 *  
## life_expectancy                1.352e+02  4.162e+01   3.247 0.001178 ** 
## sim_diversity_index            1.863e+03  9.891e+02   1.884 0.059715 .  
## area                           1.805e-02  7.895e-02   0.229 0.819143    
## X._democrat                   -3.167e+02  1.052e+03  -0.301 0.763428    
## pop_density                    7.973e-01  1.220e-01   6.533 7.59e-11 ***
## non_hispanic_african_american  1.360e+03  1.114e+04   0.122 0.902878    
## american_indian                1.655e+03  1.133e+04   0.146 0.883938    
## asian                         -7.723e+04  1.236e+04  -6.247 4.79e-10 ***
## native_hawaiian               -3.299e+04  4.739e+04  -0.696 0.486302    
## hispanic                       8.683e+03  1.087e+04   0.799 0.424328    
## non_hispanic_white             1.560e+03  1.148e+04   0.136 0.891904    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3741 on 2873 degrees of freedom
## Multiple R-squared:  0.9262, Adjusted R-squared:  0.9244 
## F-statistic: 514.8 on 70 and 2873 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(basic_model)
## Warning: not plotting observations with leverage one:
##   68, 283

boxcox(basic_model)
hist(data$cases, breaks = 200)
hist(log(data$cases), breaks = 200)

  • Analysis for this portion is included in the appendix 2, which shows that the basic model is very poor which is further revealed by the fact that the Normal Q-Q plot is very poor and many points surpass Cook’s distance. This basic model was followed up with a boxcox plot which “computes and optionally plots profile log-likelihoods for the parameter of the Box-Cox power transformation”. The boxcox call in R suggests that a log transformation of the variable cases is needed. The same basic model with the transformed cases response variable was then computed to understand the improvements to the model.
log_model = lm(log(cases) ~  state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data)
summary(log_model)
## 
## Call:
## lm(formula = log(cases) ~ state + mask_score + ratio_pop_to_physician + 
##     unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + 
##     pop_size + X._rural + life_expectancy + sim_diversity_index + 
##     area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + 
##     american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2257 -0.3917  0.0555  0.4466  2.6514 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    7.511e+00  2.161e+00   3.476 0.000517 ***
## stateAlaska                   -1.724e+00  7.376e-01  -2.337 0.019524 *  
## stateArizona                  -1.229e+00  2.340e-01  -5.254 1.60e-07 ***
## stateArkansas                 -6.024e-01  1.208e-01  -4.985 6.55e-07 ***
## stateCalifornia               -1.858e+00  1.608e-01 -11.554  < 2e-16 ***
## stateColorado                 -1.810e+00  1.445e-01 -12.531  < 2e-16 ***
## stateConnecticut              -8.421e-01  2.708e-01  -3.109 0.001896 ** 
## stateDelaware                 -4.424e-01  4.132e-01  -1.071 0.284382    
## stateDistrict of Columbia     -9.940e-01  7.320e-01  -1.358 0.174600    
## stateFlorida                  -2.057e-01  1.303e-01  -1.579 0.114520    
## stateGeorgia                  -8.059e-01  1.087e-01  -7.413 1.62e-13 ***
## stateHawaii                   -7.262e+00  9.964e-01  -7.289 4.03e-13 ***
## stateIdaho                    -9.190e-01  1.586e-01  -5.796 7.53e-09 ***
## stateIllinois                 -6.896e-01  1.251e-01  -5.511 3.88e-08 ***
## stateIndiana                  -5.565e-01  1.201e-01  -4.633 3.77e-06 ***
## stateIowa                     -6.498e-01  1.325e-01  -4.904 9.94e-07 ***
## stateKansas                   -1.307e+00  1.189e-01 -10.994  < 2e-16 ***
## stateKentucky                 -7.489e-01  1.208e-01  -6.199 6.49e-10 ***
## stateLouisiana                -4.306e-01  1.241e-01  -3.469 0.000531 ***
## stateMaine                    -1.777e+00  2.074e-01  -8.570  < 2e-16 ***
## stateMaryland                 -1.154e+00  1.726e-01  -6.686 2.75e-11 ***
## stateMassachusetts            -1.356e+00  2.252e-01  -6.022 1.94e-09 ***
## stateMichigan                 -5.552e-01  1.323e-01  -4.198 2.78e-05 ***
## stateMinnesota                -6.922e-01  1.378e-01  -5.025 5.35e-07 ***
## stateMississippi              -3.192e-01  1.213e-01  -2.632 0.008541 ** 
## stateMissouri                 -4.687e-01  1.163e-01  -4.030 5.73e-05 ***
## stateMontana                  -1.206e+00  1.506e-01  -8.007 1.70e-15 ***
## stateNebraska                 -1.143e+00  1.283e-01  -8.904  < 2e-16 ***
## stateNevada                   -2.834e+00  2.261e-01 -12.531  < 2e-16 ***
## stateNew Hampshire            -1.453e+00  2.470e-01  -5.881 4.54e-09 ***
## stateNew Jersey               -1.034e+00  1.885e-01  -5.485 4.50e-08 ***
## stateNew Mexico               -2.012e+00  1.843e-01 -10.916  < 2e-16 ***
## stateNew York                 -1.367e+00  1.452e-01  -9.414  < 2e-16 ***
## stateNorth Carolina           -6.134e-01  1.154e-01  -5.318 1.13e-07 ***
## stateNorth Dakota             -6.513e-01  2.243e-01  -2.903 0.003725 ** 
## stateOhio                     -5.446e-01  1.253e-01  -4.346 1.44e-05 ***
## stateOklahoma                 -8.593e-01  1.596e-01  -5.384 7.86e-08 ***
## stateOregon                   -2.186e+00  1.665e-01 -13.126  < 2e-16 ***
## statePennsylvania             -8.953e-01  1.341e-01  -6.675 2.95e-11 ***
## stateRhode Island             -1.505e+00  3.337e-01  -4.510 6.75e-06 ***
## stateSouth Carolina           -2.000e-01  1.347e-01  -1.485 0.137733    
## stateSouth Dakota             -9.209e-01  1.403e-01  -6.565 6.14e-11 ***
## stateTennessee                -1.426e-01  1.161e-01  -1.228 0.219410    
## stateTexas                    -1.471e+00  1.212e-01 -12.137  < 2e-16 ***
## stateUtah                     -1.655e+00  1.710e-01  -9.677  < 2e-16 ***
## stateVermont                  -2.112e+00  2.407e-01  -8.776  < 2e-16 ***
## stateVirginia                 -1.609e+00  1.110e-01 -14.505  < 2e-16 ***
## stateWashington               -1.778e+00  1.621e-01 -10.971  < 2e-16 ***
## stateWest Virginia            -1.166e+00  1.394e-01  -8.362  < 2e-16 ***
## stateWisconsin                -1.899e-02  1.372e-01  -0.138 0.889970    
## stateWyoming                  -1.720e+00  1.875e-01  -9.174  < 2e-16 ***
## mask_score                     5.334e-02  5.764e-02   0.925 0.354810    
## ratio_pop_to_physician        -2.397e-05  7.372e-06  -3.251 0.001162 ** 
## unemploy_rate                  2.294e+00  1.302e+00   1.761 0.078340 .  
## HS_grad_rate                  -6.235e-01  2.439e-01  -2.556 0.010635 *  
## flu_vacc_.                     2.497e+00  1.903e-01  13.122  < 2e-16 ***
## uninsured                      9.895e-01  8.059e-01   1.228 0.219607    
## median_income                  1.675e-05  1.866e-06   8.980  < 2e-16 ***
## pop_size                       1.036e-06  4.816e-08  21.504  < 2e-16 ***
## X._rural                      -2.290e+00  6.768e-02 -33.832  < 2e-16 ***
## life_expectancy               -3.685e-02  7.723e-03  -4.772 1.92e-06 ***
## sim_diversity_index            8.284e-01  1.835e-01   4.514 6.63e-06 ***
## area                           9.784e-05  1.465e-05   6.679 2.88e-11 ***
## X._democrat                    1.641e-01  1.952e-01   0.841 0.400635    
## pop_density                    3.744e-05  2.264e-05   1.653 0.098342 .  
## non_hispanic_african_american  2.022e+00  2.067e+00   0.978 0.327970    
## american_indian                2.617e+00  2.103e+00   1.244 0.213423    
## asian                          3.576e+00  2.294e+00   1.559 0.119125    
## native_hawaiian                4.570e+01  8.792e+00   5.198 2.16e-07 ***
## hispanic                       2.723e+00  2.016e+00   1.351 0.176946    
## non_hispanic_white             2.457e+00  2.130e+00   1.153 0.248818    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6942 on 2873 degrees of freedom
## Multiple R-squared:  0.773,  Adjusted R-squared:  0.7674 
## F-statistic: 139.7 on 70 and 2873 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(log_model)
## Warning: not plotting observations with leverage one:
##   68, 283

- The log transformation of our response variable, cases, produces a much better Normal Q-Q but it is still very heavy tailed on the negative side of the standardized residuals as seen in the code and output in appendix 2. Though, the model has shown improvement with these steps we see one of the main predictors in the model appears to be population size or pop_size which is not very informative in the final model that would be useful in constructing. Therefore, it is reasonable to consider normalizing the total cases based on population size.

Building a simple model with cases normalized by population size.

data_norm = data
data_norm$cases_norm = data_norm$cases/data_norm$pop_size
norm_model = lm(cases_norm ~ state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data_norm)

# setting this up for usage later
set.seed(10)
sample = sample.split(data_norm,SplitRatio = 0.5)
data.t = subset(data_norm,sample ==TRUE)
data.v = subset(data_norm,sample ==FALSE)

boxcox(norm_model)

par(mfrow = c(2,2))
plot(norm_model)
## Warning: not plotting observations with leverage one:
##   68, 283

hist(data_norm$cases_norm, breaks = 200)
hist(sqrt(data_norm$cases_norm), breaks = 200)

  • After normalizing for population size the BoxCox plot to see if a transformation should be done to the response variable, covid cases. The results now suggests to perform a square root transformation of the response variable covid cases as a \[\lambda\] of 0.5 was obtained.

### Creating a simple model based on the sqrt(cases_norm) as the predictor variable.

sqrt_norm_model = lm(sqrt(cases_norm) ~  state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + area + X._democrat + sim_diversity_index + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data_norm)
summary(sqrt_norm_model)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ state + mask_score + ratio_pop_to_physician + 
##     unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + 
##     X._rural + life_expectancy + area + X._democrat + sim_diversity_index + 
##     life_expectancy + pop_density + non_hispanic_african_american + 
##     american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, 
##     data = data_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.107070 -0.018932 -0.001631  0.016028  0.223177 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -5.621e-01  9.794e-02  -5.739 1.05e-08 ***
## stateAlaska                    4.076e-02  3.342e-02   1.219 0.222755    
## stateArizona                  -4.093e-02  1.061e-02  -3.859 0.000116 ***
## stateArkansas                 -1.361e-06  5.477e-03   0.000 0.999802    
## stateCalifornia               -7.600e-02  7.283e-03 -10.435  < 2e-16 ***
## stateColorado                 -5.367e-02  6.546e-03  -8.198 3.63e-16 ***
## stateConnecticut              -3.819e-02  1.227e-02  -3.112 0.001875 ** 
## stateDelaware                 -2.613e-02  1.873e-02  -1.395 0.163018    
## stateDistrict of Columbia     -3.668e-02  3.316e-02  -1.106 0.268736    
## stateFlorida                  -2.485e-03  5.899e-03  -0.421 0.673592    
## stateGeorgia                  -1.353e-02  4.926e-03  -2.746 0.006078 ** 
## stateHawaii                   -1.361e-01  4.506e-02  -3.019 0.002554 ** 
## stateIdaho                     4.181e-03  7.186e-03   0.582 0.560767    
## stateIllinois                  2.428e-02  5.668e-03   4.284 1.89e-05 ***
## stateIndiana                   3.798e-03  5.444e-03   0.698 0.485448    
## stateIowa                      5.223e-02  6.006e-03   8.695  < 2e-16 ***
## stateKansas                    1.782e-02  5.387e-03   3.309 0.000948 ***
## stateKentucky                 -1.444e-02  5.473e-03  -2.638 0.008390 ** 
## stateLouisiana                 1.835e-03  5.626e-03   0.326 0.744296    
## stateMaine                    -9.074e-02  9.397e-03  -9.656  < 2e-16 ***
## stateMaryland                 -4.063e-02  7.821e-03  -5.195 2.19e-07 ***
## stateMassachusetts            -3.433e-02  1.021e-02  -3.363 0.000780 ***
## stateMichigan                 -8.421e-03  5.992e-03  -1.405 0.159996    
## stateMinnesota                 2.734e-02  6.244e-03   4.380 1.23e-05 ***
## stateMississippi               2.373e-03  5.497e-03   0.432 0.665986    
## stateMissouri                  9.972e-03  5.271e-03   1.892 0.058612 .  
## stateMontana                   2.899e-02  6.825e-03   4.248 2.23e-05 ***
## stateNebraska                  1.524e-02  5.816e-03   2.621 0.008815 ** 
## stateNevada                   -6.629e-02  1.023e-02  -6.477 1.10e-10 ***
## stateNew Hampshire            -7.590e-02  1.120e-02  -6.779 1.46e-11 ***
## stateNew Jersey               -4.059e-02  8.532e-03  -4.757 2.06e-06 ***
## stateNew Mexico               -8.404e-02  8.344e-03 -10.072  < 2e-16 ***
## stateNew York                 -6.359e-02  6.580e-03  -9.664  < 2e-16 ***
## stateNorth Carolina           -3.303e-02  5.228e-03  -6.318 3.06e-10 ***
## stateNorth Dakota              8.842e-02  1.017e-02   8.696  < 2e-16 ***
## stateOhio                     -2.265e-02  5.676e-03  -3.990 6.77e-05 ***
## stateOklahoma                  1.759e-02  7.233e-03   2.431 0.015098 *  
## stateOregon                   -7.091e-02  7.546e-03  -9.397  < 2e-16 ***
## statePennsylvania             -5.237e-02  6.076e-03  -8.618  < 2e-16 ***
## stateRhode Island             -2.829e-02  1.512e-02  -1.871 0.061453 .  
## stateSouth Carolina           -2.219e-02  6.106e-03  -3.634 0.000284 ***
## stateSouth Dakota              7.617e-02  6.357e-03  11.982  < 2e-16 ***
## stateTennessee                 2.359e-02  5.260e-03   4.484 7.60e-06 ***
## stateTexas                    -5.743e-02  5.488e-03 -10.465  < 2e-16 ***
## stateUtah                     -2.416e-02  7.752e-03  -3.116 0.001849 ** 
## stateVermont                  -8.616e-02  1.091e-02  -7.899 3.98e-15 ***
## stateVirginia                 -4.222e-02  5.018e-03  -8.413  < 2e-16 ***
## stateWashington               -5.640e-02  7.345e-03  -7.679 2.18e-14 ***
## stateWest Virginia            -5.279e-02  6.315e-03  -8.360  < 2e-16 ***
## stateWisconsin                 5.752e-02  6.220e-03   9.248  < 2e-16 ***
## stateWyoming                  -6.723e-04  8.493e-03  -0.079 0.936912    
## mask_score                    -1.333e-02  2.609e-03  -5.108 3.47e-07 ***
## ratio_pop_to_physician        -8.594e-07  3.341e-07  -2.572 0.010149 *  
## unemploy_rate                 -2.469e-01  5.898e-02  -4.186 2.92e-05 ***
## HS_grad_rate                   1.572e-02  1.105e-02   1.423 0.154904    
## flu_vacc_.                     3.769e-02  8.624e-03   4.370 1.28e-05 ***
## uninsured                     -6.465e-02  3.650e-02  -1.771 0.076633 .  
## median_income                 -3.500e-07  8.454e-08  -4.141 3.56e-05 ***
## X._rural                      -2.178e-02  3.062e-03  -7.111 1.44e-12 ***
## life_expectancy                1.014e-05  3.500e-04   0.029 0.976895    
## area                           1.670e-07  6.569e-07   0.254 0.799282    
## X._democrat                   -8.072e-02  8.845e-03  -9.126  < 2e-16 ***
## sim_diversity_index            1.304e-02  8.318e-03   1.567 0.117126    
## pop_density                    7.062e-07  1.009e-06   0.700 0.484073    
## non_hispanic_african_american  9.581e-01  9.368e-02  10.228  < 2e-16 ***
## american_indian                9.632e-01  9.529e-02  10.108  < 2e-16 ***
## asian                          9.351e-01  1.037e-01   9.020  < 2e-16 ***
## native_hawaiian                2.249e+00  3.985e-01   5.645 1.81e-08 ***
## hispanic                       9.772e-01  9.137e-02  10.694  < 2e-16 ***
## non_hispanic_white             8.347e-01  9.654e-02   8.647  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03146 on 2874 degrees of freedom
## Multiple R-squared:  0.6574, Adjusted R-squared:  0.6492 
## F-statistic: 79.94 on 69 and 2874 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(sqrt_norm_model)
## Warning: not plotting observations with leverage one:
##   68, 283

- After transforming the covid cased normalized by population size the model Q-Q plot is not great…… Now that basic model exploration has been conducted more advanced selection techniques will be use such as regsubsets in R as well as stepAIC. Regsubsets was performed using nvmax=10 and nbest=6 represent the maximum size of subsets to be examined and the number of subsets of each size to record, respectively. Regsubsets was performed using forward and backward stepwise search since the exhaustive search with second order interactions was deemed slow by the leaps library in R.

sqrt_norm_model = lm(sqrt(cases_norm) ~  mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + area + X._democrat + sim_diversity_index + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data_norm)
summary(sqrt_norm_model)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + ratio_pop_to_physician + 
##     unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + 
##     X._rural + life_expectancy + area + X._democrat + sim_diversity_index + 
##     life_expectancy + pop_density + non_hispanic_african_american + 
##     american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, 
##     data = data_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.145860 -0.025914 -0.001729  0.023545  0.243744 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -4.430e-01  9.467e-02  -4.679 3.01e-06 ***
## mask_score                    -6.594e-02  2.525e-03 -26.115  < 2e-16 ***
## ratio_pop_to_physician        -1.196e-06  4.217e-07  -2.836  0.00461 ** 
## unemploy_rate                 -6.371e-01  6.406e-02  -9.946  < 2e-16 ***
## HS_grad_rate                   2.301e-02  1.244e-02   1.850  0.06443 .  
## flu_vacc_.                     5.001e-02  1.005e-02   4.975 6.92e-07 ***
## uninsured                     -1.500e-01  2.592e-02  -5.788 7.89e-09 ***
## median_income                 -4.417e-07  9.907e-08  -4.458 8.58e-06 ***
## X._rural                      -3.022e-02  3.651e-03  -8.277  < 2e-16 ***
## life_expectancy                1.710e-03  4.031e-04   4.242 2.29e-05 ***
## area                          -3.602e-06  6.563e-07  -5.487 4.43e-08 ***
## X._democrat                   -6.849e-02  9.510e-03  -7.202 7.53e-13 ***
## sim_diversity_index           -1.452e-02  9.720e-03  -1.494  0.13540    
## pop_density                   -2.260e-07  1.219e-06  -0.185  0.85293    
## non_hispanic_african_american  9.370e-01  8.890e-02  10.540  < 2e-16 ***
## american_indian                1.022e+00  9.224e-02  11.085  < 2e-16 ***
## asian                          7.943e-01  1.034e-01   7.685 2.08e-14 ***
## native_hawaiian                1.482e+00  2.965e-01   4.999 6.10e-07 ***
## hispanic                       8.800e-01  8.620e-02  10.209  < 2e-16 ***
## non_hispanic_white             7.599e-01  9.206e-02   8.255 2.28e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04098 on 2924 degrees of freedom
## Multiple R-squared:  0.4087, Adjusted R-squared:  0.4049 
## F-statistic: 106.4 on 19 and 2924 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(sqrt_norm_model)

summary(sqrt_norm_model)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + ratio_pop_to_physician + 
##     unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + 
##     X._rural + life_expectancy + area + X._democrat + sim_diversity_index + 
##     life_expectancy + pop_density + non_hispanic_african_american + 
##     american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, 
##     data = data_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.145860 -0.025914 -0.001729  0.023545  0.243744 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -4.430e-01  9.467e-02  -4.679 3.01e-06 ***
## mask_score                    -6.594e-02  2.525e-03 -26.115  < 2e-16 ***
## ratio_pop_to_physician        -1.196e-06  4.217e-07  -2.836  0.00461 ** 
## unemploy_rate                 -6.371e-01  6.406e-02  -9.946  < 2e-16 ***
## HS_grad_rate                   2.301e-02  1.244e-02   1.850  0.06443 .  
## flu_vacc_.                     5.001e-02  1.005e-02   4.975 6.92e-07 ***
## uninsured                     -1.500e-01  2.592e-02  -5.788 7.89e-09 ***
## median_income                 -4.417e-07  9.907e-08  -4.458 8.58e-06 ***
## X._rural                      -3.022e-02  3.651e-03  -8.277  < 2e-16 ***
## life_expectancy                1.710e-03  4.031e-04   4.242 2.29e-05 ***
## area                          -3.602e-06  6.563e-07  -5.487 4.43e-08 ***
## X._democrat                   -6.849e-02  9.510e-03  -7.202 7.53e-13 ***
## sim_diversity_index           -1.452e-02  9.720e-03  -1.494  0.13540    
## pop_density                   -2.260e-07  1.219e-06  -0.185  0.85293    
## non_hispanic_african_american  9.370e-01  8.890e-02  10.540  < 2e-16 ***
## american_indian                1.022e+00  9.224e-02  11.085  < 2e-16 ***
## asian                          7.943e-01  1.034e-01   7.685 2.08e-14 ***
## native_hawaiian                1.482e+00  2.965e-01   4.999 6.10e-07 ***
## hispanic                       8.800e-01  8.620e-02  10.209  < 2e-16 ***
## non_hispanic_white             7.599e-01  9.206e-02   8.255 2.28e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04098 on 2924 degrees of freedom
## Multiple R-squared:  0.4087, Adjusted R-squared:  0.4049 
## F-statistic: 106.4 on 19 and 2924 DF,  p-value: < 2.2e-16
#print(xtable(as.data.frame(sqrt_norm_model$coefficients), type = "latex"))

r2_normal <- function(model){
  test <- anova(model)
  sse <- test[,2]
  names <- rownames(test)
  total_sse <- sum(sse)
  output <- cbind(names,sse/total_sse)
  colnames(output) <- c("variables", "r2")
return(output)
}

#print(r2_normal(sqrt_norm_model))

Using Regsubsets

?regsubsets
sub_set = regsubsets(sqrt(cases_norm) ~ (mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data.t, nvmax=10, nbest=6, method='forward')
res.sum <- summary(sub_set)


summary(res.sum,all.best=TRUE,matrix=T,matrix.logical=F,df=NULL)
##        Length Class      Mode     
## which  12660  -none-     logical  
## rsq       60  -none-     numeric  
## rss       60  -none-     numeric  
## adjr2     60  -none-     numeric  
## cp        60  -none-     numeric  
## bic       60  -none-     numeric  
## outmat 12600  -none-     character
## obj       28  regsubsets list
#max(res.sum$adjr2)
#res.sum$outmat[60,]
res.sum$rsq[60]
## [1] 0.3931536
res.sum$bic[60]
## [1] -654.9958
res.sum$adjr2[60]
## [1] 0.389
res.sum$cp[60]
## [1] 397.9193
labels(which(res.sum$outmat[60,]=='*'))
##  [1] "mask_score"                       "mask_score:unemploy_rate"        
##  [3] "ratio_pop_to_physician:uninsured" "unemploy_rate:X._democrat"       
##  [5] "unemploy_rate:non_hispanic_white" "flu_vacc_.:hispanic"             
##  [7] "median_income:asian"              "X._rural:asian"                  
##  [9] "X._rural:hispanic"                "area:non_hispanic_white"
sub_set = regsubsets(sqrt(cases_norm) ~ (mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data.t, nvmax=10, nbest=6, method='backward')
## Warning in warn.extra(rval): model with initial (1) variables was better, and is
## reported
res.sum <- summary(sub_set)


summary(res.sum,all.best=TRUE,matrix=T,matrix.logical=F,df=NULL)
##        Length Class      Mode     
## which  12027  -none-     logical  
## rsq       57  -none-     numeric  
## rss       57  -none-     numeric  
## adjr2     57  -none-     numeric  
## cp        57  -none-     numeric  
## bic       57  -none-     numeric  
## outmat 11970  -none-     character
## obj       28  regsubsets list
#max(res.sum$adjr2)
#res.sum$outmat[60,]
res.sum$rsq[57]
## [1] 0.364777
res.sum$bic[57]
## [1] -587.7248
res.sum$adjr2[57]
## [1] 0.3604291
res.sum$cp[57]
## [1] 484.3295
  • Comparing the results from forward and backwards stepwise selection the model produced by the forward selection produces preferable statistics with regards to AIC and \[R^2_a\] as seen in the results supplied in appendix 2.
#final_model = lm(sqrt(cases_norm) ~ mask_score:unemploy_rate + mask_score + mask_score:median_income + unemploy_rate:non_hispanic_white + X._rural:X._democrat, data = data_norm)
#final_model = lm(sqrt(cases_norm) ~ mask_score + unemploy_rate:X._rural + uninsured:non_hispanic_white + area:american_indian + mask_score:unemploy_rate + unemploy_rate:X._democrat + median_income:X._rural + ratio_pop_to_physician:asian + unemploy_rate:non_hispanic_white + X._rural:area, data = data.t)


final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data.t)


summary(final_model)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate + 
##     ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + 
##     unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + 
##     flu_vacc_.:hispanic + median_income:asian + X._rural:asian + 
##     X._rural:hispanic + area:non_hispanic_white, data = data.t)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.141643 -0.026402 -0.002281  0.023550  0.246229 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       5.086e-01  1.163e-02  43.740  < 2e-16 ***
## mask_score                       -8.594e-02  3.947e-03 -21.772  < 2e-16 ***
## mask_score:unemploy_rate          5.169e-01  6.719e-02   7.694 2.60e-14 ***
## ratio_pop_to_physician:uninsured -3.006e-05  5.079e-06  -5.919 4.03e-09 ***
## unemploy_rate:X._democrat        -1.143e+00  2.389e-01  -4.786 1.88e-06 ***
## unemploy_rate:non_hispanic_white -2.568e+00  1.802e-01 -14.256  < 2e-16 ***
## flu_vacc_.:hispanic               1.194e-01  3.869e-02   3.088 0.002055 ** 
## median_income:asian              -2.088e-06  5.341e-07  -3.910 9.65e-05 ***
## asian:X._rural                   -1.085e+00  2.894e-01  -3.748 0.000185 ***
## hispanic:X._rural                -1.165e-01  2.289e-02  -5.089 4.07e-07 ***
## non_hispanic_white:area          -6.762e-06  1.286e-06  -5.258 1.68e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04204 on 1461 degrees of freedom
## Multiple R-squared:  0.3932, Adjusted R-squared:  0.389 
## F-statistic: 94.65 on 10 and 1461 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(final_model)

final_model$coefficients
##                      (Intercept)                       mask_score 
##                     5.086201e-01                    -8.593599e-02 
##         mask_score:unemploy_rate ratio_pop_to_physician:uninsured 
##                     5.169467e-01                    -3.005965e-05 
##        unemploy_rate:X._democrat unemploy_rate:non_hispanic_white 
##                    -1.143084e+00                    -2.568189e+00 
##              flu_vacc_.:hispanic              median_income:asian 
##                     1.194486e-01                    -2.088325e-06 
##                   asian:X._rural                hispanic:X._rural 
##                    -1.084744e+00                    -1.164722e-01 
##          non_hispanic_white:area 
##                    -6.762233e-06

Model Diagnostics

get_model_stats <- function(cand_model, full_model, data.t, data.v){

  
  adj_r2_train <- summary(cand_model)$adj.r.squared
  
  
  n <- nrow(data.t)
  sse_fs1 <- anova(cand_model)["Residuals", 2]     #SSE
  mse_fs1 <- anova(cand_model)["Residuals", 3]     #MSE
  mse_full <- anova(full_model)["Residuals", 3] #MSE  for full model
  
  p_fs1 <- length(cand_model$coefficients)          #number of coefficients
  Cp_fs1 <- sse_fs1/mse_full - (n - 2*p_fs1)
  
  aic_fs1 <- n*log(sse_fs1/n) + 2*p_fs1
  
  e.fs1=cand_model$residuals                   # residuals
  h.fs1=influence(cand_model)$hat                 # diagonals of hat matrix
  press.fs1= sum(e.fs1^2/(1-h.fs1)^2)            # calculate pressP
  
  
  #Get MSPE_v from new data for model 1
  newdata = data.v[, 1:27]
  y.hat = predict(cand_model, newdata)
    
  MSPE = mean((data.v$cases_norm- y.hat)^2)
  sse_t = sum(cand_model$residuals^2)
  
  output <- c(Cp_fs1, p_fs1, aic_fs1, press.fs1, adj_r2_train, MSPE, sse_t/nrow(data.t), press.fs1/nrow(data.t))

return(output)
}
# train models with more variables...

none_mod = lm(sqrt(cases_norm)~1, data=data.t)
model_full_2order = lm(sqrt(cases_norm)~(mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data.t)

full_mod = lm(sqrt(cases_norm)~(mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data_norm)

model_AIC_2order_3 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 3)
model_AIC_2order_5 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 5)
model_AIC_2order_7 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 7)
model_AIC_2order_10 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 10)
model_AIC_2order_15 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 15)
model_AIC_2order_20 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 20)
model_AIC_2order_25 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 25)
model_AIC_2order_30 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 30)
?regsubsets
a <- get_model_stats(model_AIC_2order_10, model_full_2order, data.t, data.v)
b <- get_model_stats(model_AIC_2order_15, model_full_2order, data.t, data.v)
c <- get_model_stats(model_AIC_2order_20, model_full_2order, data.t, data.v)
d <- get_model_stats(model_AIC_2order_25, model_full_2order, data.t, data.v)
e <- get_model_stats(model_AIC_2order_30, model_full_2order, data.t, data.v)
f <- get_model_stats(model_full_2order, model_full_2order, data.t, data.v)

x <- get_model_stats(model_AIC_2order_3, model_full_2order, data.t, data.v)
y <- get_model_stats(model_AIC_2order_5, model_full_2order, data.t, data.v)
z <- get_model_stats(model_AIC_2order_7, model_full_2order, data.t, data.v)

r <- get_model_stats(final_model, model_full_2order, data.t, data.v)
compare_models <- rbind(x,y,z,a,b,c,d,e,f,r)
rownames(compare_models) <- c("3 var","5 var","7 var","10 var","15 var","20 var","25 var","30 var","full model","regsubsetsmodel")
colnames(compare_models) <- c("Cp", "p", "aic", "pressP", "adj_r2", "MSPE", "sse/n", "pressP/n")
as.data.frame(compare_models)
##                       Cp   p       aic   pressP    adj_r2       MSPE
## 3 var           601.8137   4 -9171.939 2.898576 0.3217763 0.02476102
## 5 var           532.8505   6 -9220.870 2.806522 0.3448381 0.02486011
## 7 var           494.8167   8 -9248.250 2.757549 0.3577809 0.02493503
## 10 var          440.1028  11 -9288.790 2.686792 0.3764905 0.02496562
## 15 var          344.3974  16 -9363.504 2.548177 0.4093393 0.02488677
## 20 var          272.2915  21 -9422.843 2.460923 0.4345758 0.02507176
## 25 var          239.8135  26 -9450.041 2.430395 0.4467787 0.02510464
## 30 var          202.3095  31 -9482.784 2.379926 0.4607474 0.02507898
## full model      191.0000 191 -9501.383 3.272378 0.5180435 0.02552738
## regsubsetsmodel 402.1817  11 -9318.623 2.632862 0.3890000 0.02520130
##                       sse/n    pressP/n
## 3 var           0.001956943 0.001969141
## 5 var           0.001887825 0.001906604
## 7 var           0.001848006 0.001873335
## 10 var          0.001790492 0.001825266
## 15 var          0.001690358 0.001731099
## 20 var          0.001612579 0.001671823
## 25 var          0.001572340 0.001651083
## 30 var          0.001527339 0.001616798
## full model      0.001213490 0.002223083
## regsubsetsmodel 0.001754570 0.001788629
library(xtable)
?print.xtable
print(xtable(as.data.frame(compare_models), type = "latex"))
## % latex table generated in R 4.0.3 by xtable 1.8-4 package
## % Sun Dec 13 17:47:07 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrrr}
##   \hline
##  & Cp & p & aic & pressP & adj\_r2 & MSPE & sse/n & pressP/n \\ 
##   \hline
## 3 var & 601.81 & 4.00 & -9171.94 & 2.90 & 0.32 & 0.02 & 0.00 & 0.00 \\ 
##   5 var & 532.85 & 6.00 & -9220.87 & 2.81 & 0.34 & 0.02 & 0.00 & 0.00 \\ 
##   7 var & 494.82 & 8.00 & -9248.25 & 2.76 & 0.36 & 0.02 & 0.00 & 0.00 \\ 
##   10 var & 440.10 & 11.00 & -9288.79 & 2.69 & 0.38 & 0.02 & 0.00 & 0.00 \\ 
##   15 var & 344.40 & 16.00 & -9363.50 & 2.55 & 0.41 & 0.02 & 0.00 & 0.00 \\ 
##   20 var & 272.29 & 21.00 & -9422.84 & 2.46 & 0.43 & 0.03 & 0.00 & 0.00 \\ 
##   25 var & 239.81 & 26.00 & -9450.04 & 2.43 & 0.45 & 0.03 & 0.00 & 0.00 \\ 
##   30 var & 202.31 & 31.00 & -9482.78 & 2.38 & 0.46 & 0.03 & 0.00 & 0.00 \\ 
##   full model & 191.00 & 191.00 & -9501.38 & 3.27 & 0.52 & 0.03 & 0.00 & 0.00 \\ 
##   regsubsetsmodel & 402.18 & 11.00 & -9318.62 & 2.63 & 0.39 & 0.03 & 0.00 & 0.00 \\ 
##    \hline
## \end{tabular}
## \end{table}
plot(as.data.frame(compare_models)$aic)

plot(as.data.frame(compare_models)$Cp)

plot(as.data.frame(compare_models)$adj_r2)

final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)

r2_normal <- function(model){
  test <- anova(model)
  sse <- test[,2]
  names <- rownames(test)
  total_sse <- sum(sse)
  output <- cbind(names,sse/total_sse)
  colnames(output) <- c("variables", "r2")
return(output)
}


summary(final_model)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate + 
##     ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + 
##     unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + 
##     flu_vacc_.:hispanic + median_income:asian + X._rural:asian + 
##     X._rural:hispanic + area:non_hispanic_white, data = data_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.136577 -0.026452 -0.001916  0.023980  0.246871 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       4.982e-01  7.644e-03  65.168  < 2e-16 ***
## mask_score                       -8.319e-02  2.580e-03 -32.251  < 2e-16 ***
## mask_score:unemploy_rate          4.865e-01  4.202e-02  11.579  < 2e-16 ***
## ratio_pop_to_physician:uninsured -1.383e-05  2.511e-06  -5.509 3.93e-08 ***
## unemploy_rate:X._democrat        -1.118e+00  1.560e-01  -7.164 9.87e-13 ***
## unemploy_rate:non_hispanic_white -2.538e+00  1.167e-01 -21.755  < 2e-16 ***
## flu_vacc_.:hispanic               1.175e-01  2.431e-02   4.835 1.40e-06 ***
## median_income:asian              -1.785e-06  3.720e-07  -4.799 1.68e-06 ***
## asian:X._rural                   -7.132e-01  1.968e-01  -3.625 0.000294 ***
## hispanic:X._rural                -1.309e-01  1.581e-02  -8.275  < 2e-16 ***
## non_hispanic_white:area          -5.858e-06  8.639e-07  -6.781 1.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04127 on 2933 degrees of freedom
## Multiple R-squared:  0.3985, Adjusted R-squared:  0.3964 
## F-statistic: 194.3 on 10 and 2933 DF,  p-value: < 2.2e-16
df_final_model = as.data.frame(final_model$coefficients)
df_final_model$r2 = c('NA',r2_normal(final_model)[,2][1:10])
df_final_model
##                                  final_model$coefficients                   r2
## (Intercept)                                  4.981737e-01                   NA
## mask_score                                  -8.319280e-02    0.181703380784454
## mask_score:unemploy_rate                     4.865244e-01  0.00281206648342403
## ratio_pop_to_physician:uninsured            -1.383041e-05 2.14400529371138e-05
## unemploy_rate:X._democrat                   -1.117925e+00   0.0431924280537644
## unemploy_rate:non_hispanic_white            -2.537833e+00    0.136023534260933
## flu_vacc_.:hispanic                          1.175145e-01 6.98851182152494e-08
## median_income:asian                         -1.785194e-06  0.00311190366257682
## asian:X._rural                              -7.132081e-01  0.00750359975974371
## hispanic:X._rural                           -1.308632e-01   0.0146790170369514
## non_hispanic_white:area                     -5.858479e-06  0.00943166450959449
print(xtable(as.data.frame(df_final_model), type = "latex"))
## % latex table generated in R 4.0.3 by xtable 1.8-4 package
## % Sun Dec 13 17:47:07 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrl}
##   \hline
##  & final\_model\$coefficients & r2 \\ 
##   \hline
## (Intercept) & 0.50 & NA \\ 
##   mask\_score & -0.08 & 0.181703380784454 \\ 
##   mask\_score:unemploy\_rate & 0.49 & 0.00281206648342403 \\ 
##   ratio\_pop\_to\_physician:uninsured & -0.00 & 2.14400529371138e-05 \\ 
##   unemploy\_rate:X.\_democrat & -1.12 & 0.0431924280537644 \\ 
##   unemploy\_rate:non\_hispanic\_white & -2.54 & 0.136023534260933 \\ 
##   flu\_vacc\_.:hispanic & 0.12 & 6.98851182152494e-08 \\ 
##   median\_income:asian & -0.00 & 0.00311190366257682 \\ 
##   asian:X.\_rural & -0.71 & 0.00750359975974371 \\ 
##   hispanic:X.\_rural & -0.13 & 0.0146790170369514 \\ 
##   non\_hispanic\_white:area & -0.00 & 0.00943166450959449 \\ 
##    \hline
## \end{tabular}
## \end{table}

Model Outliers

Obtain the studentized deleted residuals and identify any outlying Y observations. Use the Bonferroni outlier test procedure at α = 0.05.

final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)
n=length(data_norm)
p=11
rsd.lm=round(rstudent(final_model), 3)
result = ifelse(rsd.lm > qt(1-0.95/2/n,n-p-1), TRUE, FALSE)
table(result)
## result
## FALSE  TRUE 
##  2901    43
outliers = names(which(result))
outliers
##  [1] "71"   "116"  "121"  "122"  "227"  "316"  "376"  "477"  "548"  "552" 
## [11] "748"  "761"  "790"  "903"  "969"  "1002" "1014" "1076" "1196" "1211"
## [21] "1222" "1226" "1240" "1309" "1538" "1581" "1603" "1648" "1906" "2210"
## [31] "2212" "2230" "2245" "2256" "2315" "2352" "2358" "2414" "2577" "2868"
## [41] "2875" "2880" "2928"
final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)



par(mfrow = c(2,2))
plot(final_model)

plot(final_model)

#data_norm
#data_norm[-c(2577),]
colnames(data_norm)
##  [1] "X.1"                           "X"                            
##  [3] "fips"                          "county"                       
##  [5] "state"                         "cases"                        
##  [7] "deaths"                        "mask_score"                   
##  [9] "ratio_pop_to_physician"        "unemploy_rate"                
## [11] "HS_grad_rate"                  "flu_vacc_."                   
## [13] "uninsured"                     "median_income"                
## [15] "pop_size"                      "X._rural"                     
## [17] "life_expectancy"               "non_hispanic_african_american"
## [19] "american_indian"               "asian"                        
## [21] "native_hawaiian"               "hispanic"                     
## [23] "non_hispanic_white"            "sim_diversity_index"          
## [25] "X._democrat"                   "area"                         
## [27] "pop_density"                   "cases_norm"
for (i in colnames(data_norm)){
  print(i)
  print(summary(eval(call('$', data_norm, i))))
}
## [1] "X.1"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   761.8  1506.5  1539.2  2330.2  3109.0 
## [1] "X"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0   842.8  1598.5  1633.3  2442.5  3234.0 
## [1] "fips"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1001   18182   29100   30355   45319   56045 
## [1] "county"
##    Length     Class      Mode 
##      2944 character character 
## [1] "state"
##    Length     Class      Mode 
##      2944 character character 
## [1] "cases"
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     11.0    469.5   1101.5   4073.6   2758.5 370729.0 
## [1] "deaths"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    6.00   17.00   78.48   48.00 7446.00 
## [1] "mask_score"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.433   2.701   3.000   2.995   3.296   3.849 
## [1] "ratio_pop_to_physician"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -3936    1048    1498    1916    2202   51307 
## [1] "unemploy_rate"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01624 0.03567 0.04388 0.04619 0.05365 0.19066 
## [1] "HS_grad_rate"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3630  0.8467  0.8951  0.8838  0.9344  1.0000 
## [1] "flu_vacc_."
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0700  0.3500  0.4200  0.4087  0.4800  0.6500 
## [1] "uninsured"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02068 0.07036 0.10272 0.10937 0.13841 0.30877 
## [1] "median_income"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24783   42208   48728   50996   56546  136191 
## [1] "pop_size"
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1718    13074    28270   107303    73641 10163507 
## [1] "X._rural"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3222  0.5739  0.5678  0.8149  1.0000 
## [1] "life_expectancy"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   66.50   75.49   77.48   77.40   79.22   97.97 
## [1] "non_hispanic_african_american"
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0005099 0.0073038 0.0244137 0.0925949 0.1061252 0.8532961 
## [1] "american_indian"
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000496 0.003656 0.005952 0.019425 0.012304 0.879781 
## [1] "asian"
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.004604 0.007189 0.015009 0.014113 0.430067 
## [1] "native_hawaiian"
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0003171 0.0005949 0.0012039 0.0011243 0.1312001 
## [1] "hispanic"
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005151 0.022798 0.042757 0.094767 0.096909 0.963230 
## [1] "non_hispanic_white"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0276  0.6470  0.8371  0.7647  0.9250  0.9792 
## [1] "sim_diversity_index"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.04102 0.14154 0.28553 0.31371 0.48660 0.82897 
## [1] "X._democrat"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.06955 0.21860 0.30358 0.33789 0.42621 0.95695 
## [1] "area"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0   426.7   599.9   946.6   900.2 20056.9 
## [1] "pop_density"
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.50    20.98    48.40   214.13   123.85 17179.10 
## [1] "cases_norm"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00140 0.02720 0.04001 0.04221 0.05357 0.19353
data_norm["2577",]
##       X.1    X  fips county state cases deaths mask_score
## 2577 2727 2844 48473 Waller Texas  1113     18       3.38
##      ratio_pop_to_physician unemploy_rate HS_grad_rate flu_vacc_. uninsured
## 2577                  51307     0.0486915        0.947       0.42 0.2385323
##      median_income pop_size  X._rural life_expectancy
## 2577         55224    51307 0.6164101        79.05264
##      non_hispanic_african_american american_indian      asian native_hawaiian
## 2577                     0.2453856      0.01403317 0.01075877     0.001052488
##       hispanic non_hispanic_white sim_diversity_index X._democrat   area
## 2577 0.3012844          0.4287719           0.6648545   0.3530929 513.43
##      pop_density cases_norm
## 2577        84.1 0.02169295
final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)

final_model_no_outlier = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm[-c(2577),])


a <- final_model$fitted.value
b <- predict(final_model_no_outlier, data_norm)
#a
#b
plot(a, b, xlab="fitted values using all cases", ylab="fitted values without outliers") ## compare fitted values
abline(0,1)

mean(abs(a-b)/a*100)
## [1] 0.7263656
par(mfrow = c(2,2))
plot(final_model)

plot(final_model_no_outlier)

summary(final_model_no_outlier)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate + 
##     ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + 
##     unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + 
##     flu_vacc_.:hispanic + median_income:asian + X._rural:asian + 
##     X._rural:hispanic + area:non_hispanic_white, data = data_norm[-c(2577), 
##     ])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.138711 -0.026040 -0.001925  0.024246  0.249273 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       5.033e-01  7.744e-03  64.986  < 2e-16 ***
## mask_score                       -8.406e-02  2.584e-03 -32.533  < 2e-16 ***
## mask_score:unemploy_rate          5.126e-01  4.248e-02  12.066  < 2e-16 ***
## ratio_pop_to_physician:uninsured -2.368e-05  3.608e-06  -6.561 6.29e-11 ***
## unemploy_rate:X._democrat        -1.210e+00  1.576e-01  -7.681 2.14e-14 ***
## unemploy_rate:non_hispanic_white -2.608e+00  1.179e-01 -22.129  < 2e-16 ***
## flu_vacc_.:hispanic               1.145e-01  2.426e-02   4.719 2.48e-06 ***
## median_income:asian              -1.837e-06  3.714e-07  -4.947 7.97e-07 ***
## asian:X._rural                   -7.141e-01  1.963e-01  -3.637  0.00028 ***
## hispanic:X._rural                -1.275e-01  1.580e-02  -8.066 1.05e-15 ***
## non_hispanic_white:area          -6.062e-06  8.636e-07  -7.020 2.75e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04118 on 2932 degrees of freedom
## Multiple R-squared:  0.4012, Adjusted R-squared:  0.3992 
## F-statistic: 196.5 on 10 and 2932 DF,  p-value: < 2.2e-16
boxplot(final_model_no_outlier$coefficients-final_model$coefficients)

summary(final_model)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate + 
##     ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + 
##     unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + 
##     flu_vacc_.:hispanic + median_income:asian + X._rural:asian + 
##     X._rural:hispanic + area:non_hispanic_white, data = data_norm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.136577 -0.026452 -0.001916  0.023980  0.246871 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       4.982e-01  7.644e-03  65.168  < 2e-16 ***
## mask_score                       -8.319e-02  2.580e-03 -32.251  < 2e-16 ***
## mask_score:unemploy_rate          4.865e-01  4.202e-02  11.579  < 2e-16 ***
## ratio_pop_to_physician:uninsured -1.383e-05  2.511e-06  -5.509 3.93e-08 ***
## unemploy_rate:X._democrat        -1.118e+00  1.560e-01  -7.164 9.87e-13 ***
## unemploy_rate:non_hispanic_white -2.538e+00  1.167e-01 -21.755  < 2e-16 ***
## flu_vacc_.:hispanic               1.175e-01  2.431e-02   4.835 1.40e-06 ***
## median_income:asian              -1.785e-06  3.720e-07  -4.799 1.68e-06 ***
## asian:X._rural                   -7.132e-01  1.968e-01  -3.625 0.000294 ***
## hispanic:X._rural                -1.309e-01  1.581e-02  -8.275  < 2e-16 ***
## non_hispanic_white:area          -5.858e-06  8.639e-07  -6.781 1.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04127 on 2933 degrees of freedom
## Multiple R-squared:  0.3985, Adjusted R-squared:  0.3964 
## F-statistic: 194.3 on 10 and 2933 DF,  p-value: < 2.2e-16
df_final_model = as.data.frame(final_model$coefficients)
df_final_model$r2 = c('NA',r2_normal(final_model)[,2][1:10])
df_final_model
##                                  final_model$coefficients                   r2
## (Intercept)                                  4.981737e-01                   NA
## mask_score                                  -8.319280e-02    0.181703380784454
## mask_score:unemploy_rate                     4.865244e-01  0.00281206648342403
## ratio_pop_to_physician:uninsured            -1.383041e-05 2.14400529371138e-05
## unemploy_rate:X._democrat                   -1.117925e+00   0.0431924280537644
## unemploy_rate:non_hispanic_white            -2.537833e+00    0.136023534260933
## flu_vacc_.:hispanic                          1.175145e-01 6.98851182152494e-08
## median_income:asian                         -1.785194e-06  0.00311190366257682
## asian:X._rural                              -7.132081e-01  0.00750359975974371
## hispanic:X._rural                           -1.308632e-01   0.0146790170369514
## non_hispanic_white:area                     -5.858479e-06  0.00943166450959449
#print(xtable(as.data.frame(df_final_model), type = "latex"))
summary(final_model_no_outlier)
## 
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate + 
##     ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + 
##     unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + 
##     flu_vacc_.:hispanic + median_income:asian + X._rural:asian + 
##     X._rural:hispanic + area:non_hispanic_white, data = data_norm[-c(2577), 
##     ])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.138711 -0.026040 -0.001925  0.024246  0.249273 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       5.033e-01  7.744e-03  64.986  < 2e-16 ***
## mask_score                       -8.406e-02  2.584e-03 -32.533  < 2e-16 ***
## mask_score:unemploy_rate          5.126e-01  4.248e-02  12.066  < 2e-16 ***
## ratio_pop_to_physician:uninsured -2.368e-05  3.608e-06  -6.561 6.29e-11 ***
## unemploy_rate:X._democrat        -1.210e+00  1.576e-01  -7.681 2.14e-14 ***
## unemploy_rate:non_hispanic_white -2.608e+00  1.179e-01 -22.129  < 2e-16 ***
## flu_vacc_.:hispanic               1.145e-01  2.426e-02   4.719 2.48e-06 ***
## median_income:asian              -1.837e-06  3.714e-07  -4.947 7.97e-07 ***
## asian:X._rural                   -7.141e-01  1.963e-01  -3.637  0.00028 ***
## hispanic:X._rural                -1.275e-01  1.580e-02  -8.066 1.05e-15 ***
## non_hispanic_white:area          -6.062e-06  8.636e-07  -7.020 2.75e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04118 on 2932 degrees of freedom
## Multiple R-squared:  0.4012, Adjusted R-squared:  0.3992 
## F-statistic: 196.5 on 10 and 2932 DF,  p-value: < 2.2e-16
df_final_model = as.data.frame(final_model_no_outlier$coefficients)
df_final_model$r2 = c('NA',r2_normal(final_model_no_outlier)[,2][1:10])
df_final_model
##                                  final_model_no_outlier$coefficients
## (Intercept)                                             5.032552e-01
## mask_score                                             -8.405843e-02
## mask_score:unemploy_rate                                5.126312e-01
## ratio_pop_to_physician:uninsured                       -2.367590e-05
## unemploy_rate:X._democrat                              -1.210433e+00
## unemploy_rate:non_hispanic_white                       -2.608188e+00
## flu_vacc_.:hispanic                                     1.145099e-01
## median_income:asian                                    -1.837365e-06
## asian:X._rural                                         -7.141138e-01
## hispanic:X._rural                                      -1.274679e-01
## non_hispanic_white:area                                -6.062256e-06
##                                                    r2
## (Intercept)                                        NA
## mask_score                           0.18155105239686
## mask_score:unemploy_rate          0.00281273741313736
## ratio_pop_to_physician:uninsured 1.29792732660245e-05
## unemploy_rate:X._democrat          0.0436781444233098
## unemploy_rate:non_hispanic_white    0.138251201500967
## flu_vacc_.:hispanic               1.5217923565925e-06
## median_income:asian               0.00342050329718961
## asian:X._rural                    0.00742418461362043
## hispanic:X._rural                   0.014008053183516
## non_hispanic_white:area            0.0100632438989106
print(xtable(as.data.frame(df_final_model), type = "latex"))
## % latex table generated in R 4.0.3 by xtable 1.8-4 package
## % Sun Dec 13 17:47:11 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrl}
##   \hline
##  & final\_model\_no\_outlier\$coefficients & r2 \\ 
##   \hline
## (Intercept) & 0.50 & NA \\ 
##   mask\_score & -0.08 & 0.18155105239686 \\ 
##   mask\_score:unemploy\_rate & 0.51 & 0.00281273741313736 \\ 
##   ratio\_pop\_to\_physician:uninsured & -0.00 & 1.29792732660245e-05 \\ 
##   unemploy\_rate:X.\_democrat & -1.21 & 0.0436781444233098 \\ 
##   unemploy\_rate:non\_hispanic\_white & -2.61 & 0.138251201500967 \\ 
##   flu\_vacc\_.:hispanic & 0.11 & 1.5217923565925e-06 \\ 
##   median\_income:asian & -0.00 & 0.00342050329718961 \\ 
##   asian:X.\_rural & -0.71 & 0.00742418461362043 \\ 
##   hispanic:X.\_rural & -0.13 & 0.014008053183516 \\ 
##   non\_hispanic\_white:area & -0.00 & 0.0100632438989106 \\ 
##    \hline
## \end{tabular}
## \end{table}